#!/bin/bash
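# Start a local Xinference server, wait until it is reachable, then deploy
# Qwen2.5-VL-32B-Instruct on the vLLM engine from local weights.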


# Launch the Xinference server in the background, using ModelScope as the
# model download source and a dedicated cache directory.
XINFERENCE_MODEL_SRC=modelscope XINFERENCE_HOME=/xinference/xinference_cache xinference-local -H 0.0.0.0 &
SERVER_PID=$!
# Block until the server's REST endpoint (default port 9997) responds.
until curl -s "http://localhost:9997" > /dev/null; do
  sleep 1
done
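# Deploy the model on the vLLM engine: 4 replicas on 4 GPUs, served from
# local weights. Underscore-style options such as --max_model_len are not
# native xinference flags; they are passed through as extra kwargs to the
# inference engine.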


xinference launch \
  --model-engine vllm \
  --model-name qwen2.5-vl-instruct \
  --size-in-billions 32 \
  --model-format pytorch \
  --model-path /home/weights/Qwen2.5-VL-32B-Instruct \
  --n-gpu 4 \
  --replica 4 \
  --max_model_len $((1024*8)) \
  --trust-remote-code 1
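# Optional smoke test after the launch returns (hypothetical prompt; assumes
# Xinference's OpenAI-compatible endpoint on the default port):
#   curl http://localhost:9997/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "qwen2.5-vl-instruct", "messages": [{"role": "user", "content": "Hello"}]}'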

# Keep the script in the foreground as long as the server process runs
# (e.g., when used as a container entrypoint).
wait $SERVER_PID


